"""
去哪儿网反爬虫
爬取目标站点去哪儿网航班信息：
https://flight.qunar.com/site/oneway_list.htm?searchDepartureAirport=%E5%8C%97%E4%BA%AC&searchArrivalAirport=%E4%B8%8A%E6%B5%B7&searchDepartureTime=2021-10-17&searchArrivalTime=2021-10-19&nextNDays=0&startSearch=true&fromCode=BJS&toCode=SHA&from=flight_dom_search&lowestPrice=null
"""
import re
import json
from lxml import etree

# Collected flight records; one dict per flight is appended later in this script.
flight_info = list()

# Parse the locally saved search-result page as UTF-8 encoded HTML.
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('./qunar.html', parser=parser)

# Every element whose class attribute contains "b-airfly"; each match is
# handled as one flight entry by the loop below.
flights = html.xpath('//*[contains(@class, "b-airfly")]')

# Match the whole CSS property name only: the lookbehind rejects longer names
# such as ``margin-left`` or ``max-width`` that merely end with the same word.
_LEFT_PX = re.compile(r'(?<![\w-])left\s*:\s*([-+]?\d+)\s*px', re.IGNORECASE)
_WIDTH_PX = re.compile(r'(?<![\w-])width\s*:\s*([-+]?\d+)\s*px', re.IGNORECASE)


def _px_value(pattern, style, prop):
    """Return the last integer pixel value that ``pattern`` finds in ``style``.

    Raises:
        ValueError: if ``style`` holds no integer pixel value for ``prop``.
    """
    found = pattern.findall(style)
    if not found:
        raise ValueError(
            'no integer {0!r} pixel value in style {1!r}'.format(prop, style))
    # When a property is declared more than once, use the final declaration.
    return int(found[-1])


def deal_price_ele(eles):
    """Extract the text, pixel offset and slot index of each price element.

    For every element the first text node (stripped) and the first ``style``
    attribute are read.  The integer ``left`` and ``width`` pixel values are
    parsed from that style, and the slot index is ``left / width`` truncated
    toward zero.

    Args:
        eles: iterable of element-like objects exposing ``xpath(query)``.

    Returns:
        A list of dicts with keys ``position`` (the ``left`` value in pixels),
        ``text`` (the stripped text) and ``index`` (the computed slot index),
        in the same order as ``eles``.

    Raises:
        IndexError: if an element has no text node or no ``style`` attribute.
        ValueError: if the style lacks an integer ``left`` or ``width`` value.
        ZeroDivisionError: if the parsed width is 0.
    """
    alternate_price = []
    for item in eles:
        text = item.xpath('.//text()')[0].strip()
        style = item.xpath('.//@style')[0]
        position = _px_value(_LEFT_PX, style, 'left')
        width = _px_value(_WIDTH_PX, style, 'width')
        # int() truncates toward zero when position is not an exact multiple
        # of width (``//`` would floor instead for negative offsets).
        index = int(position / width)
        alternate_price.append(dict(position=position, text=text, index=index))
    return alternate_price

for flight in flights:
    # Flight name: first text node found under a <span> inside an element
    # whose class contains "air".
    name_texts = flight.xpath('.//*[contains(@class, "air")]//span//text()')
    air_name = name_texts[0].strip()

    # Departure: all text under the "airport" node(s) of the "sep-lf" part.
    departure_texts = flight.xpath('.//*[contains(@class, "sep-lf")]//*[contains(@class, "airport")]//text()')
    from_addr = "".join(departure_texts).strip()

    # Destination: same lookup within the "sep-rt" part.
    arrival_texts = flight.xpath('.//*[contains(@class, "sep-rt")]//*[contains(@class, "airport")]//text()')
    to_addr = "".join(arrival_texts).strip()

    # Price: the first <b> supplies the base digits (text nodes under its <i>
    # descendants); every following <b> overwrites one of those digits at the
    # index computed by deal_price_ele.
    price_nodes = flight.xpath('.//*[contains(@class, "fix_price")]//*[contains(@class, "rel")]//b')
    base_node = price_nodes[0]
    digits = base_node.xpath('.//i//text()')
    for patch in deal_price_ele(price_nodes[1:]):
        digits[patch['index']] = patch['text']
    price = float("".join(digits))

    flight_info.append({
        'air_name': air_name,
        'from_addr': from_addr,
        'to_addr': to_addr,
        'price': price,
    })

# Dump every collected record as indented JSON, keeping non-ASCII text readable.
report = json.dumps(flight_info, ensure_ascii=False, indent=4)
print(report)